PostgresoL Blog

Hope these queries can also help you in your day to day quest to make Postgres run better!The first three queries rank top SQL from pg_stat_statements either by execution count, mean_exec_time and tot...

Hope these queries can also help you in your day to day quest to make Postgres run better!

The first three queries rank top SQL from pg_stat_statements either by execution count, mean_exec_time and total_exec_time. I love to use these queries to get a quick glance of what I should focus on tuning. Given that pg_stat_statements tracks lots of things, you can filter out certain “query text” as necessary so that those do not influence the ranking.

Top SQL by Mean Exec Time

WITH
hist AS (
SELECT queryid::text,
       SUBSTRING(query from 1 for 1000) query,
       ROW_NUMBER () OVER (ORDER BY mean_exec_time::numeric DESC) rn,
       SUM(mean_exec_time::numeric) mean_exec_time
  FROM pg_stat_statements
 WHERE queryid IS NOT NULL
        AND query::text not like '%pg\_%'
        AND query::text not like '%g\_%'
        /* Add more filters here */
 GROUP BY
       queryid,
       SUBSTRING(query from 1 for 1000),
       mean_exec_time::numeric
),
total AS (
SELECT SUM(mean_exec_time::numeric) mean_exec_time FROM hist
)
SELECT DISTINCT
       h.queryid::text,
       ROUND(h.mean_exec_time::numeric,3) mean_exec_time,
       ROUND(100 * h.mean_exec_time / t.mean_exec_time, 1) percent,
       h.query
  FROM hist h,
       total t
 WHERE h.mean_exec_time >= t.mean_exec_time / 1000 AND rn <= 14
 UNION ALL
SELECT 'Others',
       ROUND(COALESCE(SUM(h.mean_exec_time), 0), 3) mean_exec_time,
       COALESCE(ROUND(100 * SUM(h.mean_exec_time) / AVG(t.mean_exec_time), 1), 0) percent,
       NULL sql_text
  FROM hist h,
       total t
 WHERE h.mean_exec_time < t.mean_exec_time / 1000 OR rn > 14
 ORDER BY 3 DESC NULLS LAST;

Top SQL by Total Exec Time

WITH
hist AS (
SELECT queryid::text,
       SUBSTRING(query from 1 for 100) query,
       ROW_NUMBER () OVER (ORDER BY total_exec_time::numeric DESC) rn,
       SUM(total_exec_time::numeric) total_exec_time
  FROM pg_stat_statements
 WHERE queryid IS NOT NULL
        AND query::text not like '%pg\_%'
        AND query::text not like '%g\_%'
        /* Add more filters here */
 GROUP BY
       queryid,
       SUBSTRING(query from 1 for 100),
       total_exec_time::numeric
),
total AS (
SELECT SUM(total_exec_time::numeric) total_exec_time FROM hist
)
SELECT DISTINCT
       h.queryid::text,
       ROUND(h.total_exec_time::numeric,3) total_exec_time,
       ROUND(100 * h.total_exec_time / t.total_exec_time, 1) percent,
       h.query
  FROM hist h,
       total t
 WHERE h.total_exec_time >= t.total_exec_time / 1000 AND rn <= 14
 UNION ALL
SELECT 'Others',
       ROUND(COALESCE(SUM(h.total_exec_time::numeric), 0), 3) total_exec_time,
       COALESCE(ROUND(100 * SUM(h.total_exec_time) / AVG(t.total_exec_time), 1), 0) percent,
       NULL sql_text
  FROM hist h,
       total t
 WHERE h.total_exec_time < t.total_exec_time / 1000 OR rn > 14
 ORDER BY 3 DESC NULLS LAST;

Top SQL by Execution Count

WITH
hist AS (
SELECT queryid::text,
       SUBSTRING(query from 1 for 100) query,
       ROW_NUMBER () OVER (ORDER BY calls DESC) rn,
       calls
  FROM pg_stat_statements 
 WHERE queryid IS NOT NULL
        AND query::text not like '%pg\_%'
        AND query::text not like '%g\_%'
        /* Add more filters here */
 GROUP BY
       queryid,
       SUBSTRING(query from 1 for 100),
       calls
),
total AS (
SELECT SUM(calls) calls FROM hist
)
SELECT DISTINCT
       h.queryid::text,
       h.calls,
       ROUND(100 * h.calls / t.calls, 1) percent,
       h.query
  FROM hist h,
       total t
 WHERE h.calls >= t.calls / 1000 AND rn <= 14
 UNION ALL
SELECT 'Others',
       COALESCE(SUM(h.calls), 0) calls,
       COALESCE(ROUND(100 * SUM(h.calls) / AVG(t.calls), 1), 0) percent,
       NULL sql_text
  FROM hist h,
       total t
 WHERE h.calls < t.calls / 1000 OR rn > 14
 ORDER BY 2 DESC NULLS LAST;

Object Sizes Including Toast

Shows total size for a table including a break down for its index and toast size.

SELECT
  *,
  pg_size_pretty(table_bytes) AS table,
  pg_size_pretty(toast_bytes) AS toast,
  pg_size_pretty(index_bytes) AS index,
  pg_size_pretty(total_bytes) AS total
FROM (
  SELECT
    *, total_bytes - index_bytes - COALESCE(toast_bytes, 0) AS table_bytes
  FROM (
    SELECT
      c.oid,
      nspname AS table_schema,
      relname AS table_name,
      c.reltuples AS row_estimate,
      pg_total_relation_size(c.oid) AS total_bytes,
      pg_indexes_size(c.oid) AS index_bytes,
      pg_total_relation_size(reltoastrelid) AS toast_bytes
    FROM
      pg_class c
      LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
    WHERE relkind = 'r'
  ) a
) a
WHERE table_schema like '%'
AND table_name like '%'
AND total_bytes > 0
ORDER BY total_bytes DESC;

SQL Statements Using CPU

Using pg_stat_statements, this query will allocate timing totals as CPU time.

SELECT
    pss.userid,
    pss.dbid,
    pd.datname AS db_name,
    pss.queryid,
    round((pss.total_exec_time + pss.total_plan_time)::numeric, 2) AS total_time,
    pss.calls,
    round((pss.mean_exec_time + pss.mean_plan_time)::numeric, 2) AS mean,
    round((100 * (pss.total_exec_time + pss.total_plan_time) / sum((pss.total_exec_time + pss.total_plan_time)::numeric) OVER ())::numeric, 2) AS cpu_portion_pctg,
    substr(pss.query, 1, 200) short_query
FROM
    pg_stat_statements pss,
    pg_database pd
WHERE
    pd.oid = pss.dbid
    AND query::text NOT LIKE '%FOR UPDATE%'
    /* Add more filters here */
ORDER BY
    (pss.total_exec_time + pss.total_plan_time) DESC
LIMIT 30;

Stats / Vacuum Projection Script

This script looks at the database and table options set for vacuum and analyze to give a report of when vacuum / analyze is projected to run and the last time it did run. This script will give you a good idea how well vacuum and analyze is running:

WITH tbl_reloptions AS (
SELECT
    oid,
    oid::regclass table_name,
    substr(unnest(reloptions), 1,  strpos(unnest(reloptions), '=') -1) option,
    substr(unnest(reloptions), 1 + strpos(unnest(reloptions), '=')) value
FROM
    pg_class c
WHERE reloptions is NOT null)
SELECT
    s.schemaname ||'.'|| s.relname as relname,
    n_live_tup live_tup,
    n_dead_tup dead_dup,
    n_tup_hot_upd hot_upd,
    n_mod_since_analyze mod_since_stats,
    n_ins_since_vacuum ins_since_vac,
    case
      when avacinsscalefactor.value is not null and avacinsthresh.value is not null
        then ROUND(((n_live_tup * avacinsscalefactor.value::numeric) + avacinsthresh.value::numeric),0)
      when avacinsscalefactor.value is null and avacinsthresh.value is not null
        then ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_insert_scale_factor')) + avacinsthresh.value::numeric),0)
      when avacinsscalefactor.value is not null and avacinsthresh.value is null
        then ROUND(((n_live_tup * avacinsscalefactor.value::numeric) + (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_insert_threshold')),0)
      else ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_insert_scale_factor')) + (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_insert_threshold')),0) 
    end as ins_for_vac,
    case
      when avacscalefactor.value is not null and avacthresh.value is not null
        then ROUND(((n_live_tup * avacscalefactor.value::numeric) + avacthresh.value::numeric),0)
      when avacscalefactor.value is null and avacthresh.value is not null
        then ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_scale_factor')) + avacthresh.value::numeric),0)
      when avacscalefactor.value is not null and avacthresh.value is null
        then ROUND(((n_live_tup * avacscalefactor.value::numeric) + (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_threshold')),0)
      else ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_scale_factor')) + (select setting::numeric from pg_settings where name = 'autovacuum_vacuum_threshold')),0) 
    end as mods_for_vac,
    case
      when avacanalyzescalefactor.value is not null and avacanalyzethresh.value is not null
        then ROUND(((n_live_tup * avacanalyzescalefactor.value::numeric) + avacanalyzethresh.value::numeric),0)
      when avacanalyzescalefactor.value is null and avacanalyzethresh.value is not null
        then ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_analyze_scale_factor')) + avacanalyzethresh.value::numeric),0)
      when avacanalyzescalefactor.value is not null and avacanalyzethresh.value is null
        then ROUND(((n_live_tup * avacanalyzescalefactor.value::numeric) + (select setting::numeric from pg_settings where name = 'autovacuum_analyze_threshold')),0)
      else ROUND(((n_live_tup * (select setting::numeric from pg_settings where name = 'autovacuum_analyze_scale_factor')) + (select setting::numeric from pg_settings where name = 'autovacuum_analyze_threshold')),0) 
    end as mods_for_stats,
    case
      when avacfreezeage is not null
        then ROUND((greatest(age(c.relfrozenxid),age(t.relfrozenxid))::numeric / avacfreezeage.value::numeric * 100),2) 
      else ROUND((greatest(age(c.relfrozenxid),age(t.relfrozenxid))::numeric / (select setting::numeric from pg_settings where name = 'autovacuum_freeze_max_age') * 100),2) 
      end as avac_pct_frz,
    greatest(age(c.relfrozenxid),age(t.relfrozenxid)) max_txid_age,
    to_char(last_vacuum, 'YYYY-MM-DD HH24:MI') last_vac,
    to_char(last_analyze, 'YYYY-MM-DD HH24:MI') last_stats,
    to_char(last_autovacuum, 'YYYY-MM-DD HH24:MI') last_avac,
    to_char(last_autoanalyze, 'YYYY-MM-DD HH24:MI') last_astats,
    vacuum_count vac_cnt,
    analyze_count stats_cnt,
    autovacuum_count avac_cnt,
    autoanalyze_count astats_cnt,
    c.reloptions,
    case
      when avacenabled.value is not null
        then avacenabled.value::text
      when (select setting::text from pg_settings where name = 'autovacuum') = 'on'
        then 'true'
      else 'false'
    end as autovac_enabled
FROM
    pg_stat_all_tables s
JOIN pg_class c ON (s.relid = c.oid)
LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
LEFT JOIN tbl_reloptions avacinsscalefactor on (s.relid = avacinsscalefactor.oid and avacinsscalefactor.option = 'autovacuum_vacuum_insert_scale_factor')
LEFT JOIN tbl_reloptions avacinsthresh on (s.relid = avacinsthresh.oid and avacinsthresh.option = 'autovacuum_vacuum_insert_threshold')
LEFT JOIN tbl_reloptions avacscalefactor on (s.relid = avacscalefactor.oid and avacscalefactor.option = 'autovacuum_vacuum_scale_factor')
LEFT JOIN tbl_reloptions avacthresh on (s.relid = avacthresh.oid and avacthresh.option = 'autovacuum_vacuum_threshold')
LEFT JOIN tbl_reloptions avacanalyzescalefactor on (s.relid = avacanalyzescalefactor.oid and avacanalyzescalefactor.option = 'autovacuum_analyze_scale_factor')
LEFT JOIN tbl_reloptions avacanalyzethresh on (s.relid = avacanalyzethresh.oid and avacanalyzethresh.option = 'autovacuum_analyze_threshold')
LEFT JOIN tbl_reloptions avacfreezeage on (s.relid = avacfreezeage.oid and avacfreezeage.option = 'autovacuum_freeze_max_age')
LEFT JOIN tbl_reloptions avacenabled on (s.relid = avacenabled.oid and avacenabled.option = 'autovacuum_enabled')
WHERE
    s.relname IN (
        SELECT
            t.table_name
        FROM
            information_schema.tables t
            JOIN pg_catalog.pg_class c ON (t.table_name = c.relname)
            LEFT JOIN pg_catalog.pg_user u ON (c.relowner = u.usesysid)
        WHERE
            t.table_schema like '%'
            AND (u.usename like '%' OR u.usename is null)
            AND t.table_name like '%'
            AND t.table_schema not in ('information_schema','pg_catalog')
            AND t.table_type not in ('VIEW')
            AND t.table_catalog = current_database())
    AND n_dead_tup >= 0
    AND n_live_tup > 0
ORDER BY 3;

Unused / Rarely Used Indexes

To keep a well run system, it’s important to maintain as few indexes as possible. This will show which indexes have not been recently used. The original version of this was obtained from https://www.databasesoup.com/2014/05/new-finding-unused-indexes-query.html (Josh Berkus)

WITH table_scans AS (
    SELECT
        relid,
        tables.idx_scan + tables.seq_scan AS all_scans,
        (tables.n_tup_ins + tables.n_tup_upd + tables.n_tup_del) AS writes,
        pg_relation_size(relid) AS table_size
    FROM
        pg_stat_all_tables AS tables
    WHERE
        schemaname NOT IN ('pg_toast', 'pg_catalog', 'partman')
),
all_writes AS (
    SELECT
        sum(writes) AS total_writes
    FROM
        table_scans
),
indexes AS (
    SELECT
        idx_stat.relid,
        idx_stat.indexrelid,
        idx_stat.schemaname,
        idx_stat.relname AS tablename,
        idx_stat.indexrelname AS indexname,
        idx_stat.idx_scan,
        pg_relation_size(idx_stat.indexrelid) AS index_bytes,
        indexdef ~* 'USING btree' AS idx_is_btree
    FROM
        pg_stat_user_indexes AS idx_stat
        JOIN pg_index USING (indexrelid)
        JOIN pg_indexes AS indexes ON idx_stat.schemaname = indexes.schemaname
            AND idx_stat.relname = indexes.tablename
            AND idx_stat.indexrelname = indexes.indexname
    WHERE
        pg_index.indisunique = FALSE
),
index_ratios AS (
    SELECT
        schemaname,
        tablename,
        indexname,
        idx_scan,
        all_scans,
        round((
            CASE WHEN all_scans = 0 THEN
                0.0::numeric
            ELSE
                idx_scan::numeric / all_scans * 100
            END), 2) AS index_scan_pct,
        writes,
        round((
            CASE WHEN writes = 0 THEN
                idx_scan::numeric
            ELSE
                idx_scan::numeric / writes
            END), 2) AS scans_per_write,
        pg_size_pretty(index_bytes) AS index_size,
        pg_size_pretty(table_size) AS table_size,
        idx_is_btree,
        index_bytes
    FROM
        indexes
        JOIN table_scans USING (relid)
),
index_groups AS (
    SELECT
        'Never Used Indexes' AS reason,
        *,
        1 AS grp
    FROM
        index_ratios
    WHERE
        idx_scan = 0
        AND idx_is_btree
    UNION ALL
    SELECT
        'Low Scans, High Writes' AS reason,
        *,
        2 AS grp
    FROM
        index_ratios
    WHERE
        scans_per_write <= 1
        AND index_scan_pct < 10
        AND idx_scan > 0
        AND writes > 100
        AND idx_is_btree
    UNION ALL
    SELECT
        'Seldom Used Large Indexes' AS reason,
        *,
        3 AS grp
    FROM
        index_ratios
    WHERE
        index_scan_pct < 5
        AND scans_per_write > 1
        AND idx_scan > 0
        AND idx_is_btree
        AND index_bytes > 100000000
    UNION ALL
    SELECT
        'High-Write Large Non-Btree' AS reason,
        index_ratios.*,
        4 AS grp
    FROM
        index_ratios,
        all_writes
    WHERE (writes::numeric / (total_writes + 1)) > 0.02
    AND NOT idx_is_btree
    AND index_bytes > 100000000
ORDER BY
    grp,
    index_bytes DESC
)
SELECT
    reason,
    schemaname,
    tablename,
    indexname,
    index_scan_pct,
    scans_per_write,
    index_size,
    table_size
FROM
    index_groups
WHERE
    tablename LIKE '%';

Rank Wait Events

This is a great query to rank from most frequent to less frequent wait events being observed on they system in pg_stat_activity. This does not provide historical reference, but a look at the current moment in time:

WITH waits AS (
    SELECT
        wait_event,
        rank() OVER (ORDER BY count(wait_event) DESC) rn
    FROM pg_stat_activity
    WHERE wait_event IS NOT NULL
GROUP BY wait_event ORDER BY count(wait_event) ASC
),
total AS (
    SELECT
        SUM(rn) total_waits
    FROM
        waits
)
SELECT DISTINCT
    h.wait_event,
    h.rn,
    ROUND(100 * h.rn / t.total_waits, 1) percent
FROM
    waits h,
    total t
WHERE
    h.rn >= t.total_waits / 1000
    AND rn <= 14
UNION ALL
SELECT
    'Others',
    COALESCE(SUM(h.rn), 0) rn,
    COALESCE(ROUND(100 * SUM(h.rn) / AVG(t.total_waits), 1), 0) percent
FROM
    waits h,
    total t
WHERE
    h.rn < t.total_waits / 1000
    OR rn > 14
ORDER BY
    2 DESC NULLS LAST;

Tables With Missing FK Indexes

Often, queries will experience poor run times when foreign keys do not have indexes supporting them. This is a good query to show those missing indexes:

WITH y AS (
    SELECT
        pg_catalog.format('%I.%I', n1.nspname, c1.relname) AS referencing_tbl,
        pg_catalog.quote_ident(a1.attname) AS referencing_column,
        t.conname AS existing_fk_on_referencing_tbl,
        pg_catalog.format('%I.%I', n2.nspname, c2.relname) AS referenced_tbl,
        pg_catalog.quote_ident(a2.attname) AS referenced_column,
        pg_relation_size(pg_catalog.format('%I.%I', n1.nspname, c1.relname)) AS referencing_tbl_bytes,
        pg_relation_size(pg_catalog.format('%I.%I', n2.nspname, c2.relname)) AS referenced_tbl_bytes,
        pg_catalog.format($$CREATE INDEX ON %I.%I(%I);$$, n1.nspname, c1.relname, a1.attname) AS suggestion
    FROM
        pg_catalog.pg_constraint t
        JOIN pg_catalog.pg_attribute a1 ON a1.attrelid = t.conrelid
            AND a1.attnum = t.conkey[1]
        JOIN pg_catalog.pg_class c1 ON c1.oid = t.conrelid
        JOIN pg_catalog.pg_namespace n1 ON n1.oid = c1.relnamespace
        JOIN pg_catalog.pg_class c2 ON c2.oid = t.confrelid
        JOIN pg_catalog.pg_namespace n2 ON n2.oid = c2.relnamespace
        JOIN pg_catalog.pg_attribute a2 ON a2.attrelid = t.confrelid
            AND a2.attnum = t.confkey[1]
    WHERE
        t.contype = 'f'
        AND NOT EXISTS (
            SELECT
                1
            FROM
                pg_catalog.pg_index i
            WHERE
                i.indrelid = t.conrelid
                AND i.indkey[0] = t.conkey[1]))
SELECT
    referencing_tbl,
    referencing_column,
    existing_fk_on_referencing_tbl,
    referenced_tbl,
    referenced_column,
    pg_size_pretty(referencing_tbl_bytes) AS referencing_tbl_size,
    pg_size_pretty(referenced_tbl_bytes) AS referenced_tbl_size,
    suggestion
FROM
    y
ORDER BY
    referencing_tbl_bytes DESC,
    referenced_tbl_bytes DESC,
    referencing_tbl,
    referenced_tbl,
    referencing_column,
    referenced_column;

Blocking Lock Tree

postgres.ai is a great place to get some observability queries and this is one of my favorites:

with recursive activity as (
  select
    pg_blocking_pids(pid) blocked_by,
    *,
    age(clock_timestamp(), xact_start)::interval(0) as tx_age,
    -- "pg_locks.waitstart" – PG14+ only; for older versions:  age(clock_timestamp(), state_change) as wait_age,
    age(clock_timestamp(), (select max(l.waitstart) from pg_locks l where a.pid = l.pid))::interval(0) as wait_age
  from pg_stat_activity a
  where state is distinct from 'idle'
), blockers as (
  select
    array_agg(distinct c order by c) as pids
  from (
    select unnest(blocked_by)
    from activity
  ) as dt(c)
), tree as (
  select
    activity.*,
    1 as level,
    activity.pid as top_blocker_pid,
    array[activity.pid] as path,
    array[activity.pid]::int[] as all_blockers_above
  from activity, blockers
  where
    array[pid] <@ blockers.pids
    and blocked_by = '{}'::int[]
  union all
  select
    activity.*,
    tree.level + 1 as level,
    tree.top_blocker_pid,
    path || array[activity.pid] as path,
    tree.all_blockers_above || array_agg(activity.pid) over () as all_blockers_above
  from activity, tree
  where
    not array[activity.pid] <@ tree.all_blockers_above
    and activity.blocked_by <> '{}'::int[]
    and activity.blocked_by <@ tree.all_blockers_above
)
select
  pid,
  blocked_by,
  case when wait_event_type <> 'Lock' then replace(state, 'idle in transaction', 'idletx') else 'waiting' end as state,
  wait_event_type || ':' || wait_event as wait,
  wait_age,
  tx_age,
  to_char(age(backend_xid), 'FM999,999,999,990') as xid_age,
  to_char(2147483647 - age(backend_xmin), 'FM999,999,999,990') as xmin_ttf,
  datname,
  usename,
  (select count(distinct t1.pid) from tree t1 where array[tree.pid] <@ t1.path and t1.pid <> tree.pid) as blkd,
  format(
    '%s %s%s',
    lpad('[' || pid::text || ']', 9, ' '),
    repeat('.', level - 1) || case when level > 1 then ' ' end,
    left(query, 1000)
  ) as query
from tree
order by top_blocker_pid, level, pid;


Our Customers

Industries